import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# Read the credit-card fraud training set from disk.
csv_path = "C:/Users/Mus/Downloads/fraudTrain.csv"
Data = pd.read_csv(csv_path)
# Preview the first rows to confirm the file loaded as expected.
print(Data.head())
Unnamed: 0 trans_date_trans_time cc_num \
0 0 2019-01-01 00:00:18 2703186189652095
1 1 2019-01-01 00:00:44 630423337322
2 2 2019-01-01 00:00:51 38859492057661
3 3 2019-01-01 00:01:16 3534093764340240
4 4 2019-01-01 00:03:06 375534208663984
merchant category amt first \
0 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer
1 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie
2 fraud_Lind-Buckridge entertainment 220.11 Edward
3 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy
4 fraud_Keeling-Crist misc_pos 41.96 Tyler
last gender street ... lat long \
0 Banks F 561 Perry Cove ... 36.0788 -81.1781
1 Gill F 43039 Riley Greens Suite 393 ... 48.8878 -118.2105
2 Sanchez M 594 White Dale Suite 530 ... 42.1808 -112.2620
3 White M 9443 Cynthia Court Apt. 038 ... 46.2306 -112.1138
4 Garcia M 408 Bradley Rest ... 38.4207 -79.4629
city_pop job dob \
0 3495 Psychologist, counselling 1988-03-09
1 149 Special educational needs teacher 1978-06-21
2 4154 Nature conservation officer 1962-01-19
3 1939 Patent attorney 1967-01-12
4 99 Dance movement psychotherapist 1986-03-28
trans_num unix_time merch_lat merch_long \
0 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315
1 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462
2 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481
3 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071
4 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459
is_fraud
0 0
1 0
2 0
3 0
4 0
[5 rows x 23 columns]
# Missing-value audit: number of nulls in each column.
null_counts = Data.isnull().sum()
print(null_counts)
Unnamed: 0 0 trans_date_trans_time 0 cc_num 0 merchant 0 category 0 amt 0 first 0 last 0 gender 0 street 0 city 0 state 0 zip 0 lat 0 long 0 city_pop 0 job 0 dob 0 trans_num 0 unix_time 0 merch_lat 0 merch_long 0 is_fraud 0 dtype: int64
This analysis shows that the data has no missing values.
# Recode the binary target into readable labels for plotting.
Data["fraud_label"] = Data["is_fraud"].map({0: "No", 1: "Yes"})
# Tally fraudulent vs. non-fraudulent transactions.
fraud_counts = Data["fraud_label"].value_counts()
# Build a donut chart showing the class balance.
figure = px.pie(
    values=fraud_counts.values,
    names=fraud_counts.index,
    hole=0.5,
    title="Distribution of fraudulent vs non-fraudulent transactions"
)
# Display the chart.
figure.show()
The data shows that 99.4% of the cases are non-fraudulent, whereas 0.6% are fraudulent.
# Restrict to confirmed fraud cases only (reused by the charts below).
fraud_data = Data[Data["is_fraud"] == 1]
# Tally fraud cases per spending category.
fraud_by_category = fraud_data["category"].value_counts()
# Bar chart: fraud volume per category, coloured by count.
figure = px.bar(
    fraud_by_category,
    x=fraud_by_category.index,
    y=fraud_by_category.values,
    title="Number of fraud cases per Category",
    labels={'x': "Category", 'y': "Number of fraud cases"},
    color=fraud_by_category.values,
    color_continuous_scale='Viridis'
)
# Display the chart.
figure.show()
This graph illustrates the number of fraud cases across different types of transactions or expenses. It indicates that fraud is more prevalent in grocery purchases and online shopping, while fraud cases in travel-related transactions are comparatively lower.
# Gender breakdown of the fraud cases.
fraud_by_gender = fraud_data["gender"].value_counts()
# Pie chart of the split.
figure = px.pie(
    fraud_by_gender,
    values=fraud_by_gender.values,
    names=fraud_by_gender.index,
    title="Distribution of fraud cases by Gender"
)
# Display the chart.
figure.show()
We observed that 50.2% of the fraud cases involve men, with the remaining percentage accounted for by women.
# Fraud cases per US state, most affected first.
fraud_by_state = fraud_data["state"].value_counts()
# Bar chart coloured by count.
figure = px.bar(
    fraud_by_state,
    x=fraud_by_state.index,
    y=fraud_by_state.values,
    title="Number of fraud cases per State",
    labels={'x': "State", 'y': "Number of fraud cases"},
    color=fraud_by_state.values,
    color_continuous_scale='Blues'
)
# Display the chart.
figure.show()
The graph shows that New York (NY) has the highest number of fraud cases (555), followed by Texas (TX) with 479 cases and Pennsylvania (PA) with 458 cases. In contrast, Hawaii (HI) has the lowest number of fraud cases, totaling 7, which indicates a significantly lower incidence compared to the other states.
New York, Texas and Pennsylvania have the highest percentage of fraud cases, likely because of their large populations and high economic activity. As major economic centers with many transactions, they provide more opportunities for fraud. On the other hand, Hawaii, being less densely populated and economically isolated, naturally experiences fewer incidents of fraud.
# Parse transaction timestamps so date components can be extracted.
Data['trans_date_trans_time'] = pd.to_datetime(Data['trans_date_trans_time'])
# Weekday name (Monday..Sunday) of each transaction; reused by the
# chi-square tests further down.
Data['day_of_week'] = Data['trans_date_trans_time'].dt.day_name()
# Fraud counts per weekday.
fraud_cases_per_day = Data[Data['is_fraud'] == 1]['day_of_week'].value_counts()
# Pie chart of the weekday distribution.
figure = px.pie(
    fraud_cases_per_day,
    values=fraud_cases_per_day.values,
    names=fraud_cases_per_day.index,
    title="Number of fraud cases per Day of the week"
)
# Display the chart.
figure.show()
Fraud cases are more frequent at the beginning and end of the week, with weekends showing particularly high incidence rates, followed by a decline throughout the rest of the week.
# Parse dates of birth so ages can be derived.
Data['dob'] = pd.to_datetime(Data['dob'])

def _age_on(born, today):
    """Whole years elapsed between `born` and `today` (birthday-aware)."""
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

# PERF FIX: the original lambda called datetime.now() three times per
# row (~3.9M calls over this dataset) for the same instant; hoist the
# loop-invariant timestamp and evaluate it once.
_today = datetime.now()
Data['age'] = Data['dob'].apply(lambda born: _age_on(born, _today))
# Age bins and the matching 5-year band labels (19 through 99).
bins = [19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 99]
labels = ['19-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85-89', '90-94', '95-99']
# Bucket each age into its band (used again by the chi-square tests).
Data['age_group'] = pd.cut(Data['age'], bins=bins, labels=labels, right=True, include_lowest=True)
# Fraud counts per age band.
fraud_by_age_group = Data[Data['is_fraud'] == 1].groupby('age_group').size()
# Bar chart with a value label on each bar.
ax = fraud_by_age_group.plot(kind='bar', color='skyblue')
for container in ax.containers:
    ax.bar_label(container)
plt.title('Number of fraud cases by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Number of fraud Cases')
plt.xticks(rotation=45)
plt.show()
Fraud cases are most frequent among individuals aged 25 to 69, indicating that both older adults and younger middle-aged individuals are at significant risk. In contrast, younger and older age groups may be less susceptible to fraud.
# Timestamps were parsed above; re-coerce defensively in case this
# cell is run standalone.
Data['trans_date_trans_time'] = pd.to_datetime(Data['trans_date_trans_time'])
# Hour of day (0-23) of each transaction.
Data['hour'] = Data['trans_date_trans_time'].dt.hour
# Pairwise Pearson correlations among the numeric features of interest.
correlation_matrix = Data[['hour', 'city_pop', 'amt', 'age']].corr()
# Render the matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()
The resulting heatmap indicates that these correlations are very low, suggesting that the variables amount, city population, age and hour of transaction are not strongly related to each other.
# Hourly fraud counts as a tidy frame for plotting.
fraud_by_hour = (
    Data[Data['is_fraud'] == 1]
    .groupby('hour')
    .size()
    .reset_index(name='fraud_count')
)
hours = fraud_by_hour['hour']
counts = fraud_by_hour['fraud_count']
# Line plot with a shaded area under the curve.
plt.figure(figsize=(10, 6))
plt.plot(hours, counts, marker='o', linestyle='-', color='royalblue')
plt.fill_between(hours, counts, color='royalblue', alpha=0.3)
plt.title('Fraud cases by Hour of the day')
plt.xlabel('Hour of the day')
plt.ylabel('Number of fraud cases')
# Label the x-axis as zero-padded hours ("00h" .. "23h").
plt.xticks(hours, [f"{h:02d}h" for h in hours])
plt.grid(True)
plt.show()
The line plot shows that fraud cases are more frequent in the evening, likely due to reduced vigilance or monitoring. After 11 PM, there's a sharp decline in fraud cases.
# Chi-square tests of independence between is_fraud and each
# categorical variable; p-values are collected keyed by variable name.
p_values = {}
for variable in ('gender', 'day_of_week', 'age_group'):
    contingency = pd.crosstab(Data[variable], Data['is_fraud'])
    _, p, _, _ = chi2_contingency(contingency)
    p_values[variable] = p
# Report every p-value.
for variable, p_value in p_values.items():
    print(f"P-value for {variable}: {p_value}")
P-value for gender: 3.627211385830374e-18 P-value for day_of_week: 2.2877424359174457e-37 P-value for age_group: 3.481065950288757e-107
The results of the tests indicate a statistically significant relationship between fraud and each of the variables gender, age_group, and day of the week.
# Inspect the full DataFrame after feature engineering (new columns so
# far: fraud_label, day_of_week, age, age_group, hour).
print(Data)
Unnamed: 0 trans_date_trans_time cc_num \
0 0 2019-01-01 00:00:18 2703186189652095
1 1 2019-01-01 00:00:44 630423337322
2 2 2019-01-01 00:00:51 38859492057661
3 3 2019-01-01 00:01:16 3534093764340240
4 4 2019-01-01 00:03:06 375534208663984
... ... ... ...
1296670 1296670 2020-06-21 12:12:08 30263540414123
1296671 1296671 2020-06-21 12:12:19 6011149206456997
1296672 1296672 2020-06-21 12:12:32 3514865930894695
1296673 1296673 2020-06-21 12:13:36 2720012583106919
1296674 1296674 2020-06-21 12:13:37 4292902571056973207
merchant category amt \
0 fraud_Rippin, Kub and Mann misc_net 4.97
1 fraud_Heller, Gutmann and Zieme grocery_pos 107.23
2 fraud_Lind-Buckridge entertainment 220.11
3 fraud_Kutch, Hermiston and Farrell gas_transport 45.00
4 fraud_Keeling-Crist misc_pos 41.96
... ... ... ...
1296670 fraud_Reichel Inc entertainment 15.56
1296671 fraud_Abernathy and Sons food_dining 51.70
1296672 fraud_Stiedemann Ltd food_dining 105.93
1296673 fraud_Reinger, Weissnat and Strosin food_dining 74.90
1296674 fraud_Langosh, Wintheiser and Hyatt food_dining 4.30
first last gender street ... \
0 Jennifer Banks F 561 Perry Cove ...
1 Stephanie Gill F 43039 Riley Greens Suite 393 ...
2 Edward Sanchez M 594 White Dale Suite 530 ...
3 Jeremy White M 9443 Cynthia Court Apt. 038 ...
4 Tyler Garcia M 408 Bradley Rest ...
... ... ... ... ... ...
1296670 Erik Patterson M 162 Jessica Row Apt. 072 ...
1296671 Jeffrey White M 8617 Holmes Terrace Suite 651 ...
1296672 Christopher Castaneda M 1632 Cohen Drive Suite 639 ...
1296673 Joseph Murray M 42933 Ryan Underpass ...
1296674 Jeffrey Smith M 135 Joseph Mountains ...
trans_num unix_time merch_lat merch_long \
0 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315
1 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462
2 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481
3 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071
4 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459
... ... ... ... ...
1296670 440b587732da4dc1a6395aba5fb41669 1371816728 36.841266 -111.690765
1296671 278000d2e0d2277d1de2f890067dcc0a 1371816739 38.906881 -78.246528
1296672 483f52fe67fabef353d552c1e662974c 1371816752 33.619513 -105.130529
1296673 d667cdcbadaaed3da3f4020e83591c83 1371816816 42.788940 -103.241160
1296674 8f7c8e4ab7f25875d753b422917c98c9 1371816817 46.565983 -114.186110
is_fraud fraud_label day_of_week age age_group hour
0 0 No Tuesday 36 35-39 0
1 0 No Tuesday 46 45-49 0
2 0 No Tuesday 62 60-64 0
3 0 No Tuesday 57 55-59 0
4 0 No Tuesday 38 35-39 0
... ... ... ... .. ... ...
1296670 0 No Sunday 62 60-64 12
1296671 0 No Sunday 44 40-44 12
1296672 0 No Sunday 56 55-59 12
1296673 0 No Sunday 44 40-44 12
1296674 0 No Sunday 29 25-29 12
[1296675 rows x 28 columns]
# Columns with no predictive value for modeling: identifiers, raw text
# fields, redundant geo/time columns, and helper columns built above.
useless_cols = [
    'Unnamed: 0', 'merchant', 'cc_num', 'first', 'zip', 'last',
    'trans_num', 'unix_time', 'street', 'merch_lat', 'merch_long',
    'job', 'trans_date_trans_time', 'age_group', 'dob', 'lat',
    'long', 'city', 'fraud_label', 'state',
]
# Remove them in place and summarise what remains.
Data.drop(columns=useless_cols, inplace=True)
Data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1296675 entries, 0 to 1296674 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 category 1296675 non-null object 1 amt 1296675 non-null float64 2 gender 1296675 non-null object 3 city_pop 1296675 non-null int64 4 is_fraud 1296675 non-null int64 5 day_of_week 1296675 non-null object 6 age 1296675 non-null int64 7 hour 1296675 non-null int32 dtypes: float64(1), int32(1), int64(3), object(3) memory usage: 74.2+ MB
# Partition the remaining columns by dtype: numeric vs categorical.
numeric_columns = [c for c in Data.columns if Data[c].dtype in ('int64', 'int32', 'float64')]
categorical_columns = [c for c in Data.columns if Data[c].dtype == 'object']
print("Numeric Columns:\n", numeric_columns)
print("\nCategorical Columns:\n", categorical_columns)
Numeric Columns: ['amt', 'city_pop', 'is_fraud', 'age', 'hour'] Categorical Columns: ['category', 'gender', 'day_of_week']
# Integer-encode every categorical column in place so the models below
# can consume them. Refitting the same LabelEncoder per column is safe:
# fit_transform keeps no state between calls.
Encoder = LabelEncoder()
for col in categorical_columns:
    encoded = Encoder.fit_transform(Data[col])
    Data[col] = encoded
print(Data)
category amt gender city_pop is_fraud day_of_week age hour 0 8 4.97 0 3495 0 5 36 0 1 4 107.23 0 149 0 5 46 0 2 0 220.11 1 4154 0 5 62 0 3 2 45.00 1 1939 0 5 57 0 4 9 41.96 1 99 0 5 38 0 ... ... ... ... ... ... ... ... ... 1296670 0 15.56 1 258 0 3 62 12 1296671 1 51.70 1 100 0 3 44 12 1296672 1 105.93 1 899 0 3 56 12 1296673 1 74.90 1 1126 0 3 44 12 1296674 1 4.30 1 218 0 3 29 12 [1296675 rows x 8 columns]
# Split the encoded dataset by class.
non_fraud_class = Data[Data['is_fraud'] == 0]
fraud_class = Data[Data['is_fraud'] == 1]
# ROBUSTNESS FIX: the original tuple-unpacked value_counts(), which
# relies on its sort-by-frequency ordering happening to put the
# majority class first. Look the counts up by class label instead.
class_counts = Data['is_fraud'].value_counts()
non_fraud_count = class_counts[0]
fraud_count = class_counts[1]
print("Le nombre d'observations dans non_fraud_class :", non_fraud_count)
print("Le nombre d'observations dans fraud_class :", fraud_count)
Le nombre d'observations dans non_fraud_class : 1289169 Le nombre d'observations dans fraud_class : 7506
We observe that the non-fraud class is significantly overrepresented compared to the fraud class. To address this imbalance, undersampling is applied to balance the dataset before modeling.
# Undersample the majority class: draw as many non-fraud rows as there
# are fraud rows.
# REPRODUCIBILITY FIX: the original sampled without a random_state, so
# the balanced dataset (and every downstream metric) changed on each
# run. Seed 0 matches the train_test_split seed used below.
non_fraud_under = non_fraud_class.sample(fraud_count, random_state=0)
# Stack the undersampled majority on top of all fraud rows.
under_sampled = pd.concat([non_fraud_under, fraud_class], axis=0)
print(under_sampled)
category amt gender city_pop is_fraud day_of_week age hour 1267150 8 129.69 0 6469 0 6 45 7 69280 7 50.01 1 140 0 3 34 21 621809 6 35.00 1 798 0 2 98 19 557749 0 96.47 1 33804 0 3 33 22 715748 4 124.16 1 1762 0 2 62 5 ... ... ... ... ... ... ... ... ... 1295399 11 977.01 0 105638 1 3 38 1 1295491 11 1210.91 0 105638 1 3 38 1 1295532 2 10.24 1 71335 1 3 30 2 1295666 2 21.69 0 23 1 3 54 3 1295733 2 10.20 1 71335 1 3 30 3 [15012 rows x 8 columns]
# Features / target split on the balanced dataset.
X_under = under_sampled.drop('is_fraud', axis=1)
y_under = under_sampled['is_fraud']
# Every remaining column except the target gets standardized.
columns = [c for c in under_sampled.columns if c != 'is_fraud']
# Zero-mean / unit-variance scaling of the features.
scaler = StandardScaler()
X_under[columns] = scaler.fit_transform(X_under[columns])
# Mirror the scaled values back into the balanced frame for display.
under_sampled[X_under.columns] = X_under
under_sampled
| category | amt | gender | city_pop | is_fraud | day_of_week | age | hour | |
|---|---|---|---|---|---|---|---|---|
| 1267150 | 0.319504 | -0.457529 | -0.963624 | -0.275034 | 0 | 1.683560 | -0.376063 | -0.759402 |
| 69280 | 0.062683 | -0.672017 | 1.037749 | -0.294870 | 0 | 0.104124 | -0.978567 | 0.902859 |
| 621809 | -0.194139 | -0.712422 | 1.037749 | -0.292808 | 0 | -0.422354 | 2.526910 | 0.665393 |
| 557749 | -1.735068 | -0.546953 | 1.037749 | -0.189361 | 0 | 0.104124 | -1.033340 | 1.021592 |
| 715748 | -0.707782 | -0.472415 | 1.037749 | -0.289786 | 0 | -0.422354 | 0.555079 | -0.996868 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1295399 | 1.089969 | 1.823340 | -0.963624 | 0.035779 | 1 | 0.104124 | -0.759475 | -1.471800 |
| 1295491 | 1.089969 | 2.452967 | -0.963624 | 0.035779 | 1 | 0.104124 | -0.759475 | -1.471800 |
| 1295532 | -1.221425 | -0.779072 | 1.037749 | -0.071732 | 1 | 0.104124 | -1.197659 | -1.353067 |
| 1295666 | -1.221425 | -0.748250 | -0.963624 | -0.295237 | 1 | 0.104124 | 0.116894 | -1.234334 |
| 1295733 | -1.221425 | -0.779180 | 1.037749 | -0.071732 | 1 | 0.104124 | -1.197659 | -1.234334 |
15012 rows × 8 columns
# Hold out a test set from the balanced data (default 75/25, fixed seed).
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(
    X_under, y_under, random_state=0
)
# Candidate classifiers, keyed by display name.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier()
}
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split and score it on the test split.

    Returns a (accuracy, precision, recall, f1) tuple computed on the
    test-set predictions.
    """
    fitted = model.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
        recall_score(y_test, predictions),
        f1_score(y_test, predictions),
    )
# Train and score every candidate, collecting the four metrics per model.
metric_names = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
results = {
    name: dict(zip(metric_names, evaluate_model(
        model, X_train_under, y_train_under, X_test_under, y_test_under
    )))
    for name, model in models.items()
}
# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
results_df
C:\Users\Mus\AppData\Local\anaconda3\Lib\site-packages\sklearn\ensemble\_weight_boosting.py:527: FutureWarning: The SAMME.R algorithm (the default) is deprecated and will be removed in 1.6. Use the SAMME algorithm to circumvent this warning.
| Accuracy | Precision | Recall | F1 Score | |
|---|---|---|---|---|
| Logistic Regression | 0.852918 | 0.945502 | 0.754322 | 0.839161 |
| Decision Tree | 0.965627 | 0.967437 | 0.964903 | 0.966168 |
| Random Forest | 0.971756 | 0.975726 | 0.968570 | 0.972135 |
| Gradient Boosting | 0.962963 | 0.970244 | 0.956522 | 0.963334 |
| XGBoost | 0.976286 | 0.975444 | 0.977999 | 0.976720 |
| SVC | 0.867573 | 0.914806 | 0.815610 | 0.862365 |
| KNN | 0.895817 | 0.910270 | 0.882137 | 0.895983 |
| Naive Bayes | 0.788702 | 0.952188 | 0.615506 | 0.747693 |
| AdaBoost | 0.937117 | 0.947087 | 0.928235 | 0.937566 |
After developing and evaluating 9 models, the XGBoost model proved to be the most effective, achieving an F1-Score of 97.6%. This indicates that XGBoost is highly accurate in identifying fraud cases. The Random Forest model was a close second, with an F1-Score of 97.2%.
Now, let's take a closer look at the XGBoost model.
# Refit the best-performing model (XGBoost) so it can be inspected on
# its own.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# Train on the balanced training split.
xgb_model.fit(X_train_under, y_train_under)
# Predict on the held-out test set.
y_pred = xgb_model.predict(X_test_under)
conf_matrix_xgb = confusion_matrix(y_test_under, y_pred)
print('Confusion Matrix:')
# BUG FIX: the original printed `conf_matrix`, a name not defined at
# this point in the script; print the matrix computed just above.
print(conf_matrix_xgb)
print('\nClassification Report:')
print(classification_report(y_test_under, y_pred))
Confusion Matrix:
[[1797 47]
[ 42 1867]]
Classification Report:
precision recall f1-score support
0 0.98 0.97 0.98 1844
1 0.98 0.98 0.98 1909
accuracy 0.98 3753
macro avg 0.98 0.98 0.98 3753
weighted avg 0.98 0.98 0.98 3753
To detect fraud, we can employ metrics such as total cost using a cost-sensitive matrix. By calculating the total cost, we can identify the model that minimizes overall financial and operational losses.
# Cost-sensitive evaluation: assign a monetary cost to each cell of the
# confusion matrix.
# Every flagged transaction (false or true positive) costs a fixed
# investigation fee.
Cost_FP = 2
# BUG FIX: the original referenced `Data_sampled`, which is never
# defined anywhere in this script. Use the unscaled transaction amounts
# from the full dataset: a missed fraud costs (on average) the
# transaction amount. Note `under_sampled['amt']` would not work here —
# it was standardized above, so its mean is ~0 and the false-negative
# cost would vanish.
Cost_FN = Data['amt']
Cost_TP = 2
# Correct rejections cost nothing.
Cost_TN = 0
def get_confusion_matrix(model, X_test, y_test):
    """Return the confusion matrix of `model`'s predictions on the test set."""
    predictions = model.predict(X_test)
    return confusion_matrix(y_test, predictions)
# Confusion matrix per (already fitted) model.
confusion_matrices = {
    name: get_confusion_matrix(model, X_test_under, y_test_under)
    for name, model in models.items()
}
# Tabulate each model name alongside its matrix.
confusion_df = pd.DataFrame({
    'Model': list(confusion_matrices),
    'Confusion Matrix': list(confusion_matrices.values()),
})
print(confusion_df)
Model Confusion Matrix 0 Logistic Regression [[1761, 83], [469, 1440]] 1 Decision Tree [[1782, 62], [67, 1842]] 2 Random Forest [[1798, 46], [60, 1849]] 3 Gradient Boosting [[1788, 56], [83, 1826]] 4 XGBoost [[1797, 47], [42, 1867]] 5 SVC [[1699, 145], [352, 1557]] 6 KNN [[1678, 166], [225, 1684]] 7 Naive Bayes [[1785, 59], [734, 1175]] 8 AdaBoost [[1745, 99], [137, 1772]]
# Define a function to calculate the total cost based on the confusion
# matrix and associated costs.
def calculate_cost(conf_matrix, Cost_FP, Cost_FN, Cost_TP, Cost_TN):
    """Total misclassification cost implied by a 2x2 confusion matrix.

    `conf_matrix` follows sklearn's layout (rows = true class, columns =
    predicted class): [0][0]=TN, [0][1]=FP, [1][0]=FN, [1][1]=TP.
    `Cost_FN` may be a scalar or an array-like of per-transaction
    amounts; its mean is used as the per-missed-fraud cost. The other
    three costs are scalars. Returns the scalar total cost.
    """
    TN = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]
    TP = conf_matrix[1][1]
    # BUG FIX: the original weighted Cost_TN by FN instead of TN; the
    # error was masked in this analysis only because Cost_TN is 0.
    total_cost = FP * Cost_FP + FN * np.mean(Cost_FN) + TP * Cost_TP + TN * Cost_TN
    return total_cost
# Total misclassification cost per model.
costs = {
    name: calculate_cost(matrix, Cost_FP, Cost_FN, Cost_TP, Cost_TN)
    for name, matrix in confusion_matrices.items()
}
# Tabulate, rounding to cents for display.
costs_df = pd.DataFrame({
    'Model': list(costs),
    'Total Cost': [round(value, 2) for value in costs.values()],
})
print(costs_df)
Model Total Cost 0 Logistic Regression 36081.56 1 Decision Tree 8527.37 2 Random Forest 8016.30 3 Gradient Boosting 9610.38 4 XGBoost 6786.41 5 SVC 28198.28 6 KNN 19548.61 7 Naive Bayes 54169.70 8 AdaBoost 13392.05
A model with a high F1 Score is generally expected to have a low total cost, and our results confirm this: Naive Bayes has the highest total cost, while XGBoost has the lowest.
We can also use Saving Score as a metric to evaluate cost improvements.
# Saving score: percentage cost reduction relative to the worst model.
# The most expensive model is the reference and scores 0%.
reference_cost = max(costs.values())
saving_score_percent = {
    model: ((reference_cost - cost) / reference_cost) * 100
    for model, cost in costs.items()
}
# Tabulate, rounded to whole percent.
saving_score_df = pd.DataFrame({
    'Model': list(saving_score_percent),
    'Saving Score (%)': [round(score) for score in saving_score_percent.values()],
})
print(saving_score_df)
Model Saving Score (%) 0 Logistic Regression 33 1 Decision Tree 84 2 Random Forest 85 3 Gradient Boosting 82 4 XGBoost 87 5 SVC 48 6 KNN 64 7 Naive Bayes 0 8 AdaBoost 75
The XGBoost model achieves the highest saving score at 87%, followed by Random Forest and Decision Tree, with saving scores of 85% and 84%, respectively.
In summary, the XGBoost model has a high saving score, indicating that it is more effective at reducing the costs associated with fraud detection.